library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.5 ✓ dplyr 1.0.4
## ✓ tibble 3.0.6 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(purrr)
library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(corrplot)
## corrplot 0.90 loaded
library(RColorBrewer)
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 3.6.2
# Load the cleaned game data and draw DataExplorer's overview plot.
# NOTE(review): the absolute, user-specific working directory makes this
# script non-portable -- consider a project-relative path instead.
setwd("/Users/britney/Desktop/STA 395/video-game-reviews/clean_data")
games <- read.csv(file = "clean_data_all.csv")
# summary(games)
plot_intro(games)
# Popular genres ----
# Derive the release year from the release date.
games$year_released <- year(as.character(games$release_date))
genres <- games %>% select(year_released, starts_with("genres_"))

# Total per genre column, sorted in descending order.
# The column names are set explicitly instead of renaming the name that
# data.frame() would auto-generate from the colSums() expression, and the year
# column is excluded by name rather than by position.
# (assumes the genres_* columns are numeric indicators -- TODO confirm)
genre_totals <- colSums(select(genres, -year_released))
sort_genres <- data.frame(
  genre_type = names(genre_totals),
  count = unname(genre_totals),
  stringsAsFactors = FALSE
) %>%
  arrange(desc(count))

# Keep the 20 most frequent genres. head() returns fewer rows when there are
# fewer than 20 genres, where `[1:20, ]` would pad the result with NA rows.
top20_genres <- head(sort_genres, 20)

# Sum each top genre within every release year. all_of() makes it explicit
# that the selection comes from an external character vector.
top20_genres_by_year <- genres %>%
  select(year_released, all_of(top20_genres$genre_type)) %>%
  group_by(year_released) %>%
  summarise(across(everything(), sum))

# Tidy data for plotly: one row per (year, genre). Columns are chosen by prefix
# rather than by fixed positions, so this works for any number of genres.
top20_genres_by_year_tidy_count <- top20_genres_by_year %>%
  gather(key = genre_type, value = count, starts_with("genres_"))
# Strip the column prefix so only the genre name is displayed.
top20_genres_by_year_tidy_count$genre_type <- gsub(
  "genres_", "", top20_genres_by_year_tidy_count$genre_type
)

# Interactive visualization with plotly (by count), one frame per year.
visual_top20_by_count <- plot_ly(
  top20_genres_by_year_tidy_count,
  type = "bar",
  x = ~fct_reorder(genre_type, count),
  y = ~count,
  frame = ~year_released,
  showlegend = FALSE
)
visual_top20_by_count
# Top 20 genres as a percentage of each year's releases ----
# Number of games released per year (the denominator).
total_games_by_year <- count(games, year_released)

# Attach the yearly total, convert every genre count into a percentage of it,
# then reshape to one row per (year, genre) for plotly.
top20_genres_by_year_tidy_percent <- top20_genres_by_year %>%
  left_join(total_games_by_year, by = "year_released") %>%
  mutate(across(starts_with("genres_"), .fns = ~ . / n * 100)) %>%
  select(-n) %>%
  gather(key = genre_type, value = percent, starts_with("genres_")) %>%
  mutate(genre_type = gsub("genres_", "", genre_type)) # clean type name

# Interactive visualization with plotly (by percent), one frame per year.
visual_top20_by_percent <- plot_ly(
  top20_genres_by_year_tidy_percent,
  type = "bar",
  x = ~fct_reorder(genre_type, percent),
  y = ~percent,
  frame = ~year_released,
  showlegend = FALSE
)
visual_top20_by_percent
# ESRB rating trends across years ----
# Games per (year, rating) pair; rows containing any NA are dropped.
ESRB <- na.omit(count(games, year_released, esrb_ratings))

# Yearly totals computed over the remaining rows.
ESRB_by_year <- ESRB %>%
  group_by(year_released) %>%
  summarise(total_by_year = sum(n))

# Share of each rating within its year (a fraction, not scaled by 100).
ESRB <- ESRB %>%
  left_join(ESRB_by_year, by = "year_released") %>%
  mutate(percent = n / total_by_year) %>%
  select(-n, -total_by_year)

# Animated pie chart: one frame per release year.
plot_ly(
  ESRB,
  type = "pie",
  labels = ~esrb_ratings,
  values = ~percent,
  frame = ~year_released,
  textinfo = "label+percent"
)
# Trends of ESRB content descriptors over the years ----
esrb_content <- games %>% select(year_released, starts_with("esrb_descs_"))

# Sum every descriptor column within each release year.
esrb_content_by_year <- esrb_content %>%
  group_by(year_released) %>%
  summarise(across(everything(), sum))

# Tidy data for plotly: one row per (year, descriptor). Columns are selected by
# prefix rather than by fixed positions (previously 2:8), so every
# esrb_descs_* column is reshaped regardless of how many there are.
esrb_content_by_year_tidy_count <- esrb_content_by_year %>%
  gather(key = esrb_content, value = count, starts_with("esrb_descs_")) %>%
  mutate(esrb_content = gsub("esrb_descs_", "", esrb_content)) %>% # clean type name
  filter(esrb_content != "missing") # drop the "missing" descriptor category

# Interactive visualization with plotly (by count), one frame per year.
visual_esrb_content_by_count <- plot_ly(
  esrb_content_by_year_tidy_count,
  type = "bar",
  x = ~fct_reorder(esrb_content, count),
  y = ~count,
  frame = ~year_released,
  showlegend = FALSE
)
visual_esrb_content_by_count
## missing data for esrb content descriptions from 2015-2021
# ESRB content descriptors as a percentage of each year's releases ----
# Number of games released per year (the denominator).
total_games_by_year <- count(games, year_released)

# Attach the yearly total, convert each descriptor count into a percentage of
# it, then reshape to one row per (year, descriptor) for plotly.
# NOTE(review): unlike the count version above, the "missing" category is not
# filtered out here -- confirm that is intended.
esrb_content_by_year_tidy_percent <- esrb_content_by_year %>%
  left_join(total_games_by_year, by = "year_released") %>%
  mutate(across(starts_with("esrb_descs_"), .fns = ~ . / n * 100)) %>%
  select(-n) %>%
  gather(key = esrb_content, value = percent, starts_with("esrb_descs_")) %>%
  mutate(esrb_content = gsub("esrb_descs_", "", esrb_content)) # clean type name

# Interactive visualization with plotly (by percent), one frame per year.
visual_esrb_content_by_percent <- plot_ly(
  esrb_content_by_year_tidy_percent,
  type = "bar",
  x = ~fct_reorder(esrb_content, percent),
  y = ~percent,
  frame = ~year_released,
  showlegend = FALSE
)
visual_esrb_content_by_percent
# Platform trends across years ----
# Overall number of games per platform.
summary(games$platform)
## 3DS Dreamcast DS Game Boy Advance
## 369 118 554 331
## GameCube Nintendo 64 PC PlayStation
## 400 70 4543 160
## PlayStation 2 PlayStation 3 PlayStation 4 PlayStation 5
## 1243 1190 1910 124
## PlayStation Vita PSP Stadia Switch
## 242 443 5 1220
## Wii Wii U Xbox Xbox 360
## 582 178 648 1511
## Xbox One Xbox Series X
## 1036 86

# Games per (year, platform) pair; rows containing any NA are dropped.
platform <- na.omit(count(games, year_released, platform))

# Yearly totals computed over the remaining rows.
platform_by_year <- platform %>%
  group_by(year_released) %>%
  summarise(total_by_year = sum(n))

# Share of each platform within its year (a fraction, not scaled by 100).
platform <- platform %>%
  left_join(platform_by_year, by = "year_released") %>%
  mutate(percent = n / total_by_year) %>%
  select(-n, -total_by_year)

# Animated pie chart: one frame per release year.
plot_ly(
  platform,
  type = "pie",
  labels = ~platform,
  values = ~percent,
  frame = ~year_released,
  textinfo = "label+percent"
)
# Number of empty rows to append for the circular bar plot draft below.
# In the visible code this value is referenced only by that commented-out draft.
empty_bar <- 10
# Draft (disabled): circular bar plot (geom_bar + coord_polar) with rotated
# labels. It relies on a data frame `a` that is not defined in the visible code.
# Add lines to the initial dataset
# to_add <- matrix(NA, empty_bar, ncol(a))
# colnames(to_add) <- colnames(a)
# a <- rbind(a, to_add)
# a$id <- seq(1, nrow(a))
#
# # Get the name and the y position of each label
# label_data <- a
# number_of_bar <- nrow(label_data)
# angle <- 90 - 360 * (label_data$id-0.5) /number_of_bar # Subtract 0.5 because the label must have the angle of the center of the bar, not the extreme right (1) or extreme left (0)
# label_data$hjust <- ifelse( angle < -90, 1, 0)
# label_data$angle <- ifelse(angle < -90, angle+180, angle)
#
# # Make the plot
# p <- ggplot(a, aes(x=as.factor(id), y=value)) + # Note that id is a factor. If x is numeric, there is some space between the first bar
# geom_bar(stat="identity", fill=alpha("green", 0.3)) +
# ylim(-100,120) +
# theme_minimal() +
# theme(
# axis.text = element_blank(),
# axis.title = element_blank(),
# panel.grid = element_blank(),
# plot.margin = unit(rep(-1,4), "cm")
# ) +
# coord_polar(start = 0) +
# geom_text(data=label_data, aes(x=id, y=value+10, label=individual, hjust=hjust), color="black", fontface="bold",alpha=0.6, size=2.5, angle= label_data$angle, inherit.aes = FALSE )
#
# p
# Correlation between Metacritic score and user score ----
Corr <- cor(x = games$meta_score, y = games$user_score)
Corr
## [1] 0.5345749
# Distributions of user and Metacritic scores (default histogram binning)
ggplot(data = games, mapping = aes(x = user_score)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = games, mapping = aes(x = meta_score)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.